HW-02

Author

Trevor Macdonald

Code
# Load and install essential libraries using pacman
if (!require("pacman")) install.packages("pacman")

pacman::p_load(
  tidyverse, here, ggrepel, ggthemes, scales, waffle, countdown,
  openintro, patchwork, ggpmisc, ggridges, dsbox, fs,
  janitor, ggtext, palmerpenguins
)

# Set the default ggplot2 theme for all plots
ggplot2::theme_set(ggplot2::theme_minimal(base_size = 14))

# Set global console output width
options(width = 65)

# Set global chunk options for knitting
knitr::opts_chunk$set(
  fig.width = 7,
  fig.asp = 0.618,
  fig.retina = 3,
  fig.align = "center",
  dpi = 300,
  echo = TRUE,
  warning = FALSE,
  message = FALSE,
  error = FALSE
)

1 - A new day, a new plot, a new geom

Code
#| label: Load Data 

#Load data
edibnb <- dsbox::edibnb

# Qick view of tibble
edibnb |>
  slice_head(n = 10)
# A tibble: 10 × 10
      id price neighbourhood accommodates bathrooms bedrooms
   <dbl> <dbl> <chr>                <dbl>     <dbl>    <dbl>
 1 15420    80 New Town                 2       1          1
 2 24288   115 Southside                4       1.5        2
 3 38628    46 <NA>                     2       1          0
 4 44552    32 Leith                    2       1          1
 5 47616   100 Southside                2       1          1
 6 48645    71 Old Town                 3       1          1
 7 51505   175 New Town                 5       1          2
 8 54188   150 West End                 5       1          3
 9 55881   139 Leith                    6       1          4
10 58682   190 West End                10       2          4
# ℹ 4 more variables: beds <dbl>, review_scores_rating <dbl>,
#   number_of_reviews <dbl>, listing_url <chr>
Code
summarize(edibnb)
# A tibble: 1 × 0
Code
# View missing values
colSums(is.na(edibnb))
                  id                price        neighbourhood 
                   0                  199                 2294 
        accommodates            bathrooms             bedrooms 
                   0                   12                    4 
                beds review_scores_rating    number_of_reviews 
                  15                 2177                    0 
         listing_url 
                   0 
Code
# Specifically view target column stats
summary(edibnb$review_scores_rating)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  20.00   93.00   97.00   95.02   99.00  100.00    2177 
Code
#| label: clean

# Clean, group, and mutate tibble for analysis
edibnb_clean <- edibnb |>
  drop_na(neighbourhood, review_scores_rating) |> # Drop missing values
  group_by(neighbourhood) |>
  mutate(
    median_score = median(review_scores_rating)
  ) |>
  ungroup() |>
  mutate(
    neighbourhood = fct_reorder(neighbourhood, median_score) # Oder tibble for plot
  )

#example graph
#ggplot(edibnb_clean, aes(x = review_scores_rating, y = neighbourhood)) +
  #geom_density_ridges()
Code
#| label: Airbnb Plot
# Plot object
ggplot(edibnb_clean, aes(x = review_scores_rating, y = neighbourhood, fill = neighbourhood)) +
  geom_density_ridges(
    scale = 3, 
    rel_min_height = 0.01, 
    alpha = 0.7,
    #quantiles = 2, quantile_lines = TRUE
  ) +
  scale_fill_viridis_d(option = "G", guide = "none") +

  coord_cartesian(xlim = c(50, 101)) + # upper limit at 100 cuts off part of plot

  labs(
    title = "Distributions of Edinburgh Airbnb Review Scores",
    subtitle = "Ordered by median score",
    x = "Review score (0–100)",
    y = "Neighborhood",
    caption = "Source: dsbox package edibnb dataset"
  ) 

Interpretation: The Airbnb reviews are clustered in the 90-100 range. There are few outliers, but generally the mean review is very positive. When comparing the top (Morningside) and bottom (Haymarket) neighborhoods there is a slight flattening if the distribution. This suggest more variance in the reviews despite still being clustered > 90. The graph was asymmetric when plotting the full range 0-100. I zoomed the range from 50-100 to produce a more appealing visual. It’s also important to mention that the trends are more noticeable with the amended range. The color was not necessary, but I think it adds to the visual.

2 - Foreign Connected PACs

Code
# Get a list of files with "Foreign Connected PAC" in their names
list_of_files <- dir_ls(path = "data", regexp = "Foreign Connected PAC")

# Read all files and row bind them
# Keeping track of the file name in a new column called year
pac <- read_csv(list_of_files, id = "year")
Code
#clean Names with janitor
pac_janitor <- pac |>
  clean_names()

pac_clean <- pac_janitor |>
  mutate(
    # Clean currency columns and drop total
    total  = NULL,
    dems   = str_remove(dems, "\\$") |> as.numeric(),
    repubs = str_remove(repubs, "\\$") |> as.numeric(),
    # Extract second year to match example
    year = str_extract(year, "(?<=-)\\d{4}") |> as.integer() 
    # This line was produced by ChatGPT I was unable to figure out how to parse the string for second set of numbers
  ) |>
  
  # Split country and parent company
  separate(
    country_of_origin_parent_company,
    into = c("country", "parent_company"),
    sep = "/",
    extra = "merge",
    fill = "right"
  )

# Pivot long format
pac_clean_long <- pac_clean |>
  pivot_longer(
    cols = c(dems, repubs),# pivot on
    names_to = "party",        
    values_to = "amount"         
  ) |>
  
  mutate(
    # Convert party names to title case, avoids adding code for plot
    party = recode(party, dems = "Democrat", repubs = "Republican")
  )

# Sum contributions by country in new tibble
country_totals <- pac_clean_long |>
  group_by(country) |>
  summarise(
    total_amount = sum(amount, na.rm = TRUE),
    .groups = "drop"
  ) |>
  arrange(desc(total_amount))
Code
# Filter for country 
pac_uk <- pac_clean_long |>
  filter(country == "UK") |>
  group_by(year, party) |>
  summarise(
    total_contributions = sum(amount),
    .groups = "drop"
  )

ggplot(pac_uk, aes(x = year, y = total_contributions, color = party)) +
  geom_line(size = 1) +
  scale_color_manual(
    values = c("Democrat" = "blue", "Republican" = "red")
  ) +
  scale_y_continuous(
    labels = label_dollar(scale = 1e-6, suffix = "M")
  ) +
  labs(
    title = "Contributions to US political parties from UK-connected PACs",
    x = "Year",                           
    y = "Total amount",                   
    caption = "Source: OpenSecrets.org" 
  ) +
  theme(
    plot.title = element_text(size = 11, hjust = 0),
    plot.caption = element_text(size = 9, hjust = 1),
    axis.title.x = element_text(size = 9,hjust = 0),
    axis.title.y = element_text(size = 9,hjust = 0),
    axis.text = element_text(size = 7),
    legend.position = c(0.9, 0.15),
    legend.title = element_text(size = 9),
    legend.text = element_text(size = 9)
  )

Code
# Filter for country 
pac_switzerland <- pac_clean_long |>
  filter(country == "Switzerland") |>
  group_by(year, party) |>
  summarise(
    total_contributions = sum(amount),
    .groups = "drop"
  )

ggplot(pac_switzerland, aes(x = year, y = total_contributions, color = party)) +
  geom_line(size = 1) +
  scale_color_manual(
    values = c("Democrat" = "blue", "Republican" = "red")
  ) +
  scale_y_continuous(
    labels = label_dollar(scale = 1e-6, suffix = "M")
  ) +
  labs(
    title = "Contributions to US political parties from Switzerland-connected PACs",
    x = "Year",                           
    y = "Total amount",                   
    caption = "Source: OpenSecrets.org" 
  ) +
  theme(
    plot.title = element_text(size = 11, hjust = 0),
    plot.caption = element_text(size = 9, hjust = 1),
    axis.title.x = element_text(size = 9,hjust = 0),
    axis.title.y = element_text(size = 9,hjust = 0),
    axis.text = element_text(size = 7),
    legend.position = c(0.9, 0.15),
    legend.title = element_text(size = 9),
    legend.text = element_text(size = 9)
  )

Interpretation: I sorted total contributions by country and found that Switzerland had contributed the most. There seems to be an interesting behavior in republican data after ~2016 where a trend of decreasing contributions emerges while the democratic party has sustained and even slight increase until 2020. I would be interested to know if the general political leaning of Switzerland (assumed center right after google search) is correlated to these contributions. Republicans have consistently received more since 2000 until ~2016, with an exception of ~2007 when Obama was running for office. There may be some sort of shift that took place within Switzerland once trump administration took office in 2016. The trend might warrant further investigation into american policies during that period that may have affected Switzerland.

3 - Median housing prices in the US

Code
# Load and rename medain housing data
median_housing <- read_csv("data/median-housing.csv") |>
  clean_names() |>
  rename(
    price = mspus   # change price
  )

# Load and rename recession data
recessions <- read_csv("data/recessions.csv") |>
  clean_names() |>
  rename(
    date = peak,     
    end  = trough    
  )
Code
ggplot(median_housing, aes(x = date, y = price)) +
  geom_line(color = "blue", linewidth = 0.8) +
  # Scale x
  scale_x_date(
    date_labels = "%Y",   # Show years only
    date_breaks = "5 years"
  ) +
  # Scale y
    scale_y_continuous(
    limits = c(0, 400000),                            
    breaks = seq(0, 400000, by = 40000),
    labels = scales::label_comma() # call specific package
    #coord_cartesian(ylim = c(0, 440000)),
    )+
  
  labs(
    title = "Median sales price of houses sold in the United States",
    subtitle = "Not seasonally adjusted",
    x = NULL,
    y = "Dollars",
    caption = "Sources: Census; HUD"
  ) +
  # Formating
  theme(
    # Labels
    plot.title.position = "plot",
    plot.title = element_text(size = 14, hjust = 0),
    plot.subtitle = element_text(size = 12,hjust = 0),
    plot.caption = element_text(size = 10, hjust = 1),
    # Text size
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 12),
    axis.text.x = element_text(size = 9),
    axis.text.y = element_text(size = 9),
    # Hide tick marks
    axis.ticks.x = element_blank(),             
    panel.grid.major.x = element_blank(),         
    panel.grid.minor.x = element_blank(),         
    panel.grid.minor = element_blank()            
)

Code
# Date range of median_housing tibble
min_date <- min(median_housing$date)
max_date <- max(median_housing$date)

# Filter recession range with housing data
recessions_trimmed <- recessions |>
  mutate(
    in_range = (end >= min_date & 
               date <= max_date)
  ) |>
  filter(in_range)  # Filter for overlapping recessions
Code
#| label: Plot median_housing w/Recession Shading

ggplot(median_housing, aes(x = date, y = price)) +
  
  # Shaded recession dates
  geom_rect(
    data = recessions_trimmed,
    inherit.aes = FALSE,
    aes(xmin = date, xmax = end, ymin = -Inf, ymax = Inf),
    fill = "gray",
    #alpha = 0.6
  ) +
  
  geom_line(color = "blue", linewidth = 0.8) +
  # Scale x
  scale_x_date(
    date_labels = "%Y",   # Show years only
    date_breaks = "5 years"
  ) +
  # Scale y
    scale_y_continuous(
    limits = c(0, 400000),                            
    breaks = seq(0, 400000, by = 40000),
    labels = scales::label_comma() # call specific package
    #coord_cartesian(ylim = c(0, 440000)),
    )+
  
  labs(
    title = "Median sales price of houses sold in the United States",
    subtitle = "Not seasonally adjusted",
    x = NULL,
    y = "Dollars",
    caption = "Sources: Census; HUD"
  ) +
  # Formating
  theme(
    # Labels
    plot.title.position = "plot",
    plot.title = element_text(size = 14, hjust = 0),
    plot.subtitle = element_text(size = 12,hjust = 0),
    plot.caption = element_text(size = 10, hjust = 1),
    # Text size
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 12),
    axis.text.x = element_text(size = 9),
    axis.text.y = element_text(size = 9),
    # Hide tick marks
    axis.ticks.x = element_blank(),             
    panel.grid.major.x = element_blank(),         
    panel.grid.minor.x = element_blank(),         
    panel.grid.minor = element_blank()            
)

Code
#| label: median_housing by quarter

# Create subset

housing_quarters <- median_housing |>
  filter(year(date) %in% c(2019, 2020)) |>
  mutate(
    year = year(date),
    quarter = paste0("Q", quarter(date))
  )
Code
#| label: median_housing quartly plot

# Create the plot
ggplot(housing_quarters, aes(x = factor(paste(year(date), quarter(date), sep = " Q")), y = price)) +
  geom_line(group = 1, color = "blue", linewidth = 1) +
  geom_point(shape = 21, size = 2, fill = "white", color = "blue", stroke = 1) +
  # Used ChatGPT for this line. I had trouble with the x axis. 
  scale_x_discrete(
    limits = c("Q1", "Q2", "Q3", "Q4",
               "Q1", "Q2", "Q3", "Q4")
  ) +
  # Main line and points
  geom_line(color = "blue", linewidth = 1
  ) +
  geom_point(size = 2, shape = 21, fill = "white", color = "blue", stroke = 1
)+
  # x axis tick labels
  scale_x_discrete(
    labels = housing_quarters$quarter,
    expand = c(0, 0) # no "padding" at limits. Used ChatGPT for this. This drove me nuts.
  )+
  # y axis scale
  scale_y_continuous(
    limits = c(300000, 360000),
    breaks = seq(300000, 360000, 20000),
    minor_breaks = seq(310000, 350000, 20000),
    labels = label_comma(),
    expand = c(0, 0) # no "padding" at limits. Used ChatGPT for this. This drove me nuts.
  ) +
  
  # Zoom 
  coord_cartesian(
    ylim = c(300000, 360000),
    clip = "off"
  ) +
  
  labs(
    title = "Median sales price of houses sold in the United States",
    subtitle = "Not seasonally adjusted",
   # x = "2019                                          2020",
    
    x = glue::glue("2019{strrep(' ', 70)}2020"), 
   #Used ChatGPT for this clever annotation. I originally used x = "2019 ....... 2020" to try and center the labels. I think there might be a better way, but I couln't figure it out. 
    y = "Dollars"
  ) +

  theme(
    plot.title.position = "plot",
    plot.title = element_text(hjust = 0),
    plot.subtitle = element_text(hjust = 0),
    axis.text.x = element_text(size = 10),
    axis.title.x = element_text(size = 12)
  
  )

4 - Expect More. Plot More.

Code
# "Target Red" #E80018
# Source: https://www.schemecolor.com/target-red-logo-color.php

# Create and grid for image 
# Center at (0, 0)
target_grid <- expand.grid(
  x = -1000:1000,
  y = -1000:1000
) |>
  mutate(
    r = sqrt(x^2 + y^2),  # distance from center
    fill = case_when(
      r <= 132 ~ "#E80018",         # inner dot
      r <= 340 ~ "white",            # middle ring
      r <= 532 ~ "#E80018",         # outer ring
      TRUE  ~ "white"               # background
    )
  )

# Create plot
ggplot(target_grid, aes(x, y, fill = fill)) +
  geom_tile() +
  coord_fixed() +
  scale_fill_identity() +
  theme_void() +

  annotate(
    "richtext",
    x = 0, y = -750,
    label = "TARGET<sub>®</sub>",  # HTML subscript
    size = 7,
    fill = NA, 
    label.color = NA,# Remove box
    color = "#E80018",
    fontface = "bold"
    
  ) 

First I searched for the “target red”, then I brainstormed the possible techniques I could use and googled, “create image in tibble”. I essentially had the idea that I wanted to project the logo similar to how a TV works. The obvious issue is storing the data in a structure. I tried to implement trig functions and quickly realized it was getting way too complex. I settled on creating a circle using the formula centered at orgin and then fill. The first issue was resolution. I had to play with the scale to get a smooth edge. I had some trouble finding the proportions so I measured by hand and just used a ratio to scale the rings. The last problem to solve was the text and subscript. I used HTML subscript because all other methods for using subscript for the registered trademark would render incorrectly.

5 - Mirror, mirror on the wall, who’s the ugliest of them all?

Code
#| label: Standard penguin plot example

# Standard penguins plot
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g, color = species)) +
  geom_point(size = 2, alpha = 0.8) +
  labs(
    title = "Penguin body mass vs. flipper length",
    x = "Flipper length (mm)",
    y = "Body mass (g)",
    color = "Species"
  )

Code
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g, color = species)) +
  geom_point(
    size = 1,
    shape = 21,
    fill = "limegreen",
    stroke = 1
  ) +
  labs(
    title = "Penguins",
    subtitle = "body mass flippers",
    x = "length of FLIPPER",
    caption = "Plot? Yes. Ugly? Also Yes",
    color = "Species",
  ) +
  scale_color_manual(values = c("magenta", "orange", "chartreuse")) +
  theme(
    panel.background = element_rect(fill = "pink", color = "red", size = 2),
    plot.background = element_rect(fill = "yellow"),
    legend.background = element_rect(fill = "cyan", color = "black"),
    legend.position = "top",
    plot.title = element_text(family = "Comic Sans MS", face = "bold", size = 14, color = "purple"),
    plot.subtitle = element_text(size = 15, face = "italic", color = "firebrick"),
    axis.title.x = element_text(size = 10, color = "darkgreen", angle = 7),
    axis.title.y = element_text(size = 10, color = "darkblue", angle = 2),
    axis.text = element_text(size = 5, color = "brown"),
    panel.grid.major = element_line(color = "black", linetype = "dotted", linewidth = 1),
    panel.grid.minor = element_line(color = "red", linetype = "dotdash"),
    legend.text = element_text(size = 5, face = "bold.italic", color = "blue")
  )